Code
library(tidyverse)
library(rstan)
library(scales)
library(glue)
library(knitr)
theme_set(theme_minimal())
options(dplyr.summarise.inform = FALSE)
This document demonstrates the Buckley’s & None Australian election forecasting model. It includes the stan
code that defines the model and uses data in the lead up to 2019 election to demonstrate how it would have performed in that election.
This model is built by Martin Burgess and is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License.
Read in the data and package it up so that it can be input into the stan
model.
polls <- read_csv("data/polls2019.csv") |>
mutate(firm_index = as.numeric(as.factor(firm_coarse)))
all_elections <- read_csv("data/election_results2019.csv")
polls_elections <- all_elections |>
filter(!is.na(day_index))
crossbench <- read_csv("data/crossbench2019.csv")
model_input <- list(
# timeseries length
n_days = max(polls$day_index),
# priors,
tpp_mu_prior_mu = mean(all_elections$tpp_alp),
tpp_mu_prior_scale = sd(all_elections$tpp_alp),
tpp_sigma_prior_mu = 0.0025,
tpp_sigma_prior_scale = 0.001,
tpp_coef_prior_scale = 0.05,
df_prior_alpha = 32/3,
df_prior_beta = 8/3,
# election data for polling
polls_n_elections = nrow(polls_elections),
polls_election_days = as.array(polls_elections$day_index),
polls_election_tpp_alp = as.array(polls_elections$tpp_alp),
# polling data
polls_n = nrow(polls),
polls_firms_n = length(unique(polls$firm_index)),
polls_firm = polls$firm_index,
polls_tpp_alp = polls$tpp_alp,
polls_standard_error = polls$tpp_alp_se,
polls_day = polls$day_index,
polls_se_inflator = 2,
# election data for tpp2seats
tpp2seats_n_elections = nrow(all_elections),
tpp2seats_alpha_prior_mu = 0,
tpp2seats_alpha_prior_scale = 3,
tpp2seats_beta_prior_mu = 0,
tpp2seats_beta_prior_scale = 5,
tpp2seats_election_major_seats = all_elections$seats_alp + all_elections$seats_lnc,
tpp2seats_election_alp_seats = all_elections$seats_alp,
tpp2seats_election_tpp_alp = all_elections$tpp_alp,
# data for crossbench
crossbench_n_elections = nrow(crossbench),
crossbench_retain_shots = crossbench$crossbench_running,
crossbench_retained = crossbench$crossbench_retain,
crossbench_gain_shots = crossbench$gain_opportunities,
crossbench_gained = crossbench$crossbench_gain
)
Specify the stan
code that defines the model. This can be stored in a separate file but it has been included here so that everything is in one document.
The underlying two-party preferred voting intention model was inspired by a similar model for the 2019 election built by Peter Ellis. Key changes include incorporating a hyperparameter for the degrees of freedom used between days in the bayesian state space model.
stan_model <- "
data {
// timeseries length
int<lower = 1> n_days;
// priors
real<lower = 0, upper = 1> tpp_mu_prior_mu;
real<lower = 0> tpp_mu_prior_scale;
real<lower = 0> tpp_sigma_prior_mu;
real<lower = 0> tpp_sigma_prior_scale;
real<lower = 0> tpp_coef_prior_scale;
real<lower = 0> df_prior_alpha;
real<lower = 0> df_prior_beta;
// election data for polling
int polls_n_elections;
int polls_election_days[polls_n_elections]; // days on which elections occur
real polls_election_tpp_alp[polls_n_elections]; // historical election results
// polling data
// total number of polls
int<lower=1> polls_n;
// number of firms
int<lower=1> polls_firms_n;
// pollster index
int<lower=1,upper=polls_firms_n> polls_firm[polls_n];
// polling data
real<lower=0,upper=1> polls_tpp_alp[polls_n];
real<lower=0> polls_standard_error[polls_n];
int<lower=1> polls_day[polls_n];
//int<lower=1, upper=2> poll_close_to_next[polls_n];
// polling standard error inflator
real<lower=1> polls_se_inflator;
// election data for tpp2seats
int<lower=1> tpp2seats_n_elections;
real tpp2seats_alpha_prior_mu;
real<lower = 0> tpp2seats_alpha_prior_scale;
real tpp2seats_beta_prior_mu;
real<lower = 0> tpp2seats_beta_prior_scale;
int tpp2seats_election_major_seats[tpp2seats_n_elections];
int tpp2seats_election_alp_seats[tpp2seats_n_elections];
vector<lower=0, upper=1>[tpp2seats_n_elections] tpp2seats_election_tpp_alp;
// data for crossbench model
int<lower=1> crossbench_n_elections;
int<lower=0> crossbench_retain_shots[crossbench_n_elections];
int crossbench_retained [crossbench_n_elections];
int<lower=0> crossbench_gain_shots[crossbench_n_elections];
int crossbench_gained [crossbench_n_elections];
}
parameters {
// tpp_margin estimate
vector<lower=0, upper=1>[n_days] tpp_mu;
// tpp_margin sigma
real<lower=0> tpp_sigma;
// degrees of freedom between days
real<lower=1, upper=7> student_t_df;
// house effect
real<lower=-1, upper=1> house_effect[polls_firms_n];
// close to next effect
real<lower=-1, upper=1> polling_bias;
// tpp to seat
real alpha;
real beta;
// crossbench
real<lower=0, upper=1> crossbench_p_retain;
real<lower=0, upper=1> crossbench_p_gain;
}
transformed parameters {
// linear predictor
vector[tpp2seats_n_elections] tpp2seats_y_hat;
tpp2seats_y_hat = alpha + tpp2seats_election_tpp_alp * beta;
}
model {
// priors # make parameters for these
tpp_mu[1] ~ normal(tpp_mu_prior_mu, tpp_mu_prior_scale); // starting state space
tpp_sigma ~ normal(tpp_sigma_prior_mu, tpp_sigma_prior_scale); // prior for innovation sd.
house_effect ~ normal(0, tpp_coef_prior_scale); // ie a fairly loose prior for house effects (on scale of [0,1])
polling_bias ~ normal(0, tpp_coef_prior_scale);
student_t_df ~ gamma(df_prior_alpha, df_prior_beta);
// state model
tpp_mu[2:n_days] ~ student_t(student_t_df, tpp_mu[1:(n_days - 1)], tpp_sigma);
// historical election results
for(election_i in 1:polls_n_elections){
polls_election_tpp_alp[election_i] ~ normal(tpp_mu[polls_election_days[election_i]], 0.0001); // we know tpp_mu very accurately on election day
}
//polls
for(poll_i in 1:polls_n){
polls_tpp_alp[poll_i] ~ normal(tpp_mu[polls_day[poll_i]] + house_effect[polls_firm[poll_i]] + polling_bias, polls_standard_error[poll_i] * polls_se_inflator);
}
// //tpp2seats
alpha ~ normal(tpp2seats_alpha_prior_mu, tpp2seats_alpha_prior_scale);
beta ~ normal(tpp2seats_beta_prior_mu, tpp2seats_beta_prior_scale);
tpp2seats_election_alp_seats ~ binomial_logit(tpp2seats_election_major_seats, tpp2seats_y_hat);
// crossbench
crossbench_p_retain ~ beta(5,2);
crossbench_retained ~ binomial(crossbench_retain_shots, crossbench_p_retain);
crossbench_p_gain ~ beta(1,5);
crossbench_gained ~ binomial(crossbench_gain_shots, crossbench_p_gain);
}
"
Run the model. If you are running this yourself you may want to change the number of chains and cores and the number of iterations (per chain). I use 8 cores so this creates 80,000 samples and we use the second half of them.
Load samples generated by stan
, discarding the first half of samples as “burn in”.
# load samples ------------------------------------------------------------
samples <- tibble()
sample_files <- list.files(path = "output/",
pattern = "forecast_samples_",
full.names = TRUE)
for(file in sample_files){
temp <- read_csv(file,
skip = 25) |>
filter(!is.na(accept_stat__)) |>
slice(-1) |>
slice_tail(prop = 0.5) |>
select(-lp__)
samples <- bind_rows(samples,
temp)
}
# generated values --------------------------------------------------------
samples_no_ts <- samples |>
filter(!is.na(crossbench_p_retain)) |>
mutate(crossbench_retain = rbinom(n(), 7, crossbench_p_retain),
crossbench_gain = rbinom(n(), 151-7, crossbench_p_gain),
crossbench_total = crossbench_retain + crossbench_gain) |>
mutate(majors_seats = 151 - crossbench_total)
ts_start <- as.Date("2004-10-09") - 90
last_year_cols <- tibble(col_name = names(samples)) |>
filter(str_starts(col_name, "tpp_mu")) |>
separate(col = col_name,
into = c(NA, "day_index"),
sep = "\\.",
remove = FALSE) |>
mutate(day_index = as.numeric(day_index),
date = as.Date(day_index, origin = ts_start-1)) |>
filter(date > max(date) - 365)
samples_ts <- samples_no_ts |>
select(all_of(last_year_cols$col_name),
starts_with("house_effect"),
polling_bias,
alpha,
beta,
majors_seats,
crossbench_total) |>
pivot_longer(cols = starts_with("tpp_mu"),
names_to = "day",
values_to = "tpp_alp") |>
left_join(last_year_cols,
by = c("day" = "col_name")) |>
mutate(p_alp_win_seat = plogis(alpha + beta*tpp_alp)) |>
mutate(alp_seats = rbinom(n(), majors_seats, p_alp_win_seat),
lnc_seats = as.integer(majors_seats - alp_seats),
seat_margin = lnc_seats - alp_seats) |>
mutate(tpp_lnc = 1 - tpp_alp,
tpp_margin = tpp_lnc - tpp_alp)
samples_election_date <- samples_ts |>
filter(date == max(date))
rm(temp, samples, samples_no_ts)
Calculate proportion of samples for the three primary outcomes: ALP win, Liberal/National Coalition (LNC) win or minority government.
outcome | chance |
---|---|
ALP win | 16.4% |
Coalition win | 52.0% |
Minority government | 31.6% |
chances_ts <- samples_ts |>
filter(date > (max(date) - floor(365/2))) |> # last 6 months of data
group_by(date) |>
summarise(`ALP win` = mean(alp_seats >=76),
`Coalition win` = mean(lnc_seats >=76),
`Minority\ngovernment` = mean(lnc_seats <76 & alp_seats < 76)) |>
ungroup() |>
pivot_longer(cols = -c(date),
names_to = "outcome",
values_to = "chance") |>
select(date, outcome, chance) |>
mutate(outcome = fct_reorder2(outcome, date, chance))
chances_ts |>
ggplot(aes(x = date,
y = chance,
colour = outcome)) +
geom_line() +
scale_y_continuous(labels = percent_format(accuracy = 1)) +
scale_x_date(date_labels = "%b '%y") +
labs(x = "",
y = "Chance of outcome",
colour = "")
x_lab <- function(x) {
l <- tibble(x = x) |>
mutate(p = case_when(x > 0 ~ " COA",
x < 0 ~ " ALP",
TRUE ~ "")) |>
mutate(lab = glue("+{abs(x)}{p}")) |>
pull(lab)
return(l)
}
samples_election_date |>
ggplot(aes(x = seat_margin)) +
geom_vline(xintercept = 0) +
geom_histogram(binwidth = 1,
alpha = 0.9) +
geom_vline(xintercept = 9,
colour = "red") +
labs(y = "More likely →",
x = "← ALP win more, Coalition win more →") +
scale_x_continuous(labels = x_lab) +
theme(axis.text.y = element_blank()) +
annotate("text",
x = 9+1,
y = Inf,
label = "2019 result",
colour = "red",
hjust = 0,
vjust = 1)
samples_election_date |>
count(alp_seats, lnc_seats, crossbench_total) |>
mutate(prop = n/sum(n)) |>
ggplot(aes(x = alp_seats,
y = lnc_seats,
fill = prop)) +
geom_tile() +
geom_point(x = 68,
y = 76,
shape = 4,
size = 2,
alpha = 0.5,
colour = "red") +
scale_fill_continuous(labels = percent_format(accuracy = 0.1)) +
labs(y = "Coalition seats",
x = "ALP seats",
fill = "Chance") +
annotate("text",
x = 72,
y = 80,
colour = "red",
label = "2019 result",
hjust = 0,
vjust = 0)
seats_ts <- samples_ts |>
filter(date > max(date) - as.integer(365/2)) |>
pivot_longer(cols = c(alp_seats,
lnc_seats),
names_to = "party",
values_to = "seats") |>
select(date, party, seats) |>
mutate(colour = case_when(party == "alp_seats" ~ "ALP",
party == "lnc_seats" ~ "Coalition")) |>
group_by(date, party, colour) |>
summarise(seats_50 = quantile(seats, 0.5),
seats_10 = quantile(seats, 0.1),
seats_90 = quantile(seats, 0.9)) |>
mutate(across(starts_with("seats_"), ~round(.x))) |>
ungroup() |>
mutate(colour = fct_reorder2(colour, date, seats_50))
seats_ts |>
ggplot(aes(x = date,
y = seats_50,
ymin = seats_10,
ymax = seats_90,
colour = colour,
fill = colour)) +
geom_ribbon(alpha = 0.4,
colour = NA) +
geom_line() +
scale_x_date(date_labels = "%b '%y") +
labs(x = "",
y = "Seats won (80% CI)",
fill = "",
colour = "")
x_lab <- function(x) {
l <- tibble(x = x) |>
mutate(p = case_when(x > 0 ~ " COA",
x < 0 ~ " ALP",
TRUE ~ "")) |>
mutate(lab = glue("+{percent(abs(x), accuracy = 0.1)}{p}")) |>
pull(lab)
return(l)
}
samples_election_date |>
ggplot(aes(x = tpp_margin)) +
geom_vline(xintercept = 0) +
geom_density(alpha = 0.9,
fill = "grey",
colour = NA) +
geom_vline(xintercept = 0.0306,
colour = "red") +
scale_x_continuous(labels = x_lab,
limits = c(-0.09, 0.09)) +
theme(axis.text.y = element_blank()) +
labs(y = "More likely →",
x = "← ALP TPP higher, Coalition TPP higher →") +
annotate("text",
x = 0.0306 + 0.005,
y = Inf,
label = "2019 result",
colour = "red",
hjust = 0,
vjust = 1)
Warning: Removed 2 rows containing non-finite values (stat_density).
tpp_ts <- samples_ts |>
filter(date > max(date) - as.integer(365/2)) |>
pivot_longer(cols = c(tpp_alp,
tpp_lnc),
names_to = "party",
values_to = "tpp") |>
select(date, party, tpp) |>
mutate(colour = case_when(party == "tpp_alp" ~ "ALP",
party == "tpp_lnc" ~ "Coalition")) |>
group_by(date, party, colour) |>
summarise(tpp_50 = quantile(tpp, 0.5),
tpp_10 = quantile(tpp, 0.1),
tpp_90 = quantile(tpp, 0.9)) |>
ungroup() |>
mutate(colour = fct_reorder2(colour, date, tpp_50))
tpp_ts |>
ggplot(aes(x = date,
y = tpp_50,
ymin = tpp_10,
ymax = tpp_90,
colour = colour,
fill = colour)) +
geom_ribbon(alpha = 0.4,
colour = NA) +
geom_line() +
scale_y_continuous(labels = percent_format(accuracy = 1)) +
scale_x_date(date_labels = "%b '%y") +
labs(x = "",
y = "Two-party preferred vote (80% CI)",
fill = "",
colour = "")